#Prerequisites. Task1: Install and import the tidyverse package.

# Attach the core tidyverse packages (dplyr, ggplot2, tidyr, readr,
# purrr, tibble, stringr, forcats, lubridate); conflicts are listed below.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Task2: Creating tibbles with as_tibble().

# Coerce the built-in iris data frame to a tibble (nicer printing,
# no partial matching, no drop surprises).
as_tibble(iris)

Task3: Creating a new tibble from individual vectors with tibble().

# tibble() builds columns left to right: y (length 1) is recycled to
# match x, and z may refer to the columns just created.
tibble(
  x = seq_len(5),
  y = 1,
  z = x^2 + y
)

Task4:

# Non-syntactic column names are allowed when wrapped in backticks.
tb <- tibble(
  `:)`   = "smile",
  ` `    = "space",
  `2000` = "number"
)
tb

Task5:

# tribble(): transposed, row-wise tibble entry; ~x, ~y, ~z name the columns.
tribble(
  ~x,  ~y,  ~z,
  "a",  2,  3.6,
  "b",  1,  8.5
)

Task6:

# A tibble may hold heterogeneous column types (datetime, date, int,
# double, character). runif()/sample() are random, so output varies
# per run (no set.seed() here).
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)

Task7:

# Print only the first 10 rows but every column (width = Inf).
nycflights13::flights %>%
  print(n = 10, width = Inf)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
##  2        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
##  3        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
##  4       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
##  5       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
##  6        12 UA        1696 N39463  EWR    ORD        150      719     5     58
##  7        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
##  8       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
##  9        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
## 10         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
##    time_hour          
##    <dttm>             
##  1 2013-01-01 05:00:00
##  2 2013-01-01 05:00:00
##  3 2013-01-01 05:00:00
##  4 2013-01-01 05:00:00
##  5 2013-01-01 06:00:00
##  6 2013-01-01 05:00:00
##  7 2013-01-01 06:00:00
##  8 2013-01-01 06:00:00
##  9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ℹ 336,766 more rows

Task8:

# View() opens an interactive spreadsheet viewer (RStudio); this is a
# side effect only — nothing useful is returned to the document.
nycflights13::flights %>%
  View()

#Subsetting. Task9:

# A small random tibble; no set.seed(), so the printed values below
# differ on every run.
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)
# Extract by name
df$x
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
df[["x"]]
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
# Extract by position
df[[1]]
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293

Task10: Need to use the special placeholder to use these in a pipe.

# With the magrittr pipe, `.` is the placeholder for the piped value,
# so .$x and .[["x"]] extract a single column as a vector.
df %>% .$x
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293
df %>% .[["x"]]
## [1] 0.1336431 0.2271860 0.1601787 0.6879047 0.3632293

Task11:

# Convert the tibble back to a base data.frame (e.g. for older code).
class(as.data.frame(tb))
## [1] "data.frame"

#Exercise.

as_tibble(mtcars)
df <- data.frame(abc =1, xyz = "a")
# Base data.frame does $ partial matching: df$x silently matches "xyz".
# A tibble would return NULL (with a warning) instead — a key safety win.
df$x
## [1] "a"
df[, "xyz"]
## [1] "a"
df[, c("abc", "xyz")]
# Assuming var <- "mpg"
as_tibble(mtcars)
var <- "mpg"
# Use [[var]] (not $var) to extract a column whose name is stored in a
# variable:
#column <- tibble %>% 
#  {.[[var]]}

Task12:

# Read a local CSV; readr guesses column types and prints the spec below.
heights <- read_csv("heights.csv")
## Rows: 1192 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sex, race
## dbl (4): earn, height, ed, age
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task13:

# read_csv() also accepts an inline string; the first line supplies the
# column names.
read_csv("a,b,c
1,2,3
4,5,6")
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task14:

# skip = 2 drops the two metadata lines before the real header.
read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task15:

# comment = "#" drops any line that starts with "#".
read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task16:

# col_names = FALSE: there is no header row; columns are auto-named X1..X3.
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): X1, X2, X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task17:

# Supply the column names explicitly when the data has no header.
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task18:

# na = "." treats "." as missing, so column c is all-NA (guessed logical).
read_csv("a,b,c\n1,2,.", na = ".")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## lgl (1): c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Parsing a vector. Task19:

# The parse_*() functions turn character vectors into typed vectors.
str(parse_logical(c("TRUE", "FALSE", "NA")))
##  logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
##  int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
##  Date[1:2], format: "2010-01-01" "1979-10-14"

Task20:

# na = "." marks "." as missing, so the third element parses to NA.
parse_integer(c("1", "231", ".", "456"), na = ".")
## [1]   1 231  NA 456

#If parsing fails, you’ll get a warning: Task21:

# Strings that are not integers fail to parse and raise a warning.
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col               expected actual
##   3  -- no trailing characters abc   
##   4  -- no trailing characters 123.45

#The failures will be missing in the output: Task22:

# Failed parses become NA; details are attached as a "problems" attribute.
x
## [1] 123 345  NA  NA
## attr(,"problems")
## # A tibble: 2 × 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA no trailing characters abc   
## 2     4    NA no trailing characters 123.45

Task23:

# problems() returns the recorded parsing failures as a tibble.
problems(x)

#Numbers Task24:

# parse_double() honours the locale's decimal mark (a comma in much
# of Europe).
parse_double("1.23")
## [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = "," ))
## [1] 1.23

Task24:

# parse_number() ignores non-numeric text before and after the number.
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45

Task25:

# grouping_mark sets the thousands separator for the locale.
#Used in America
parse_number("$123,456,789")
## [1] 123456789
#Used in many parts of Europe.
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
#Used in Switzerland.
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
## [1] 123456789

#Strings Task26:

# charToRaw() exposes the underlying bytes (ASCII here: one byte/letter).
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79

Task27:

# Two byte strings whose encodings R cannot know from the bytes alone:
# x1 is Latin-1, x2 is Shift-JIS. Printed as-is they render incorrectly.
x1 <- "El Ni\xf10 was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
x1
## [1] "El Ni\xf10 was particularly bad this year"
x2
## [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"

Task28:

# Supplying the correct encoding lets parse_character() decode the bytes.
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niñ0 was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
## [1] "こんにちは"

Task29:

# guess_encoding() suggests likely encodings for a raw byte vector.
guess_encoding(charToRaw(x1))
guess_encoding(charToRaw(x2))

Factors

Task30:

# parse_factor() warns when a value is not in the supplied levels and
# records the failure in the "problems" attribute (see output below).
fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col           expected   actual
##   3  -- value in level set bananana
## [1] apple  banana <NA>  
## attr(,"problems")
## # A tibble: 1 × 4
##     row   col expected           actual  
##   <int> <int> <chr>              <chr>   
## 1     3    NA value in level set bananana
## Levels: apple banana

#Dates, date-times, and times Task31:

# parse_datetime() expects ISO 8601; components may be omitted from the
# right (times default to midnight, UTC).
parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
parse_datetime("20101010")
## [1] "2010-10-10 UTC"

Task32:

# parse_date() expects year, month, day separated by "-" or "/".
parse_date("2010-10-01")
## [1] "2010-10-01"

Task33:

# readr is already attached via tidyverse; hms provides the time-of-day
# class used by parse_time() (and masks lubridate::hms, noted below).
library(readr)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms

Task34:

# parse_time() accepts am/pm or 24-hour clock times.
parse_time("01:10am")
## 01:10:00
parse_time("20:10:01")
## 20:10:01

#Non-digits Task34:

# The same string parses to three different dates depending on the
# format spec: %m = month, %d = day, %y = two-digit year.
parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
## [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
## [1] "2001-02-15"

Task35:

# %B is the full month name; locale("fr") supplies French month names.
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
## [1] "2015-01-01"

#Parsing a file. # Strategy Task36:

# guess_parser() reports which parser readr would pick for a character
# vector; parse_guess() applies that guess.
guess_parser("2010-10-01")
## [1] "date"
guess_parser("15:01")
## [1] "time"
guess_parser(c("TRUE", "FALSE"))
## [1] "logical"
guess_parser(c("1","5","9"))
## [1] "double"
guess_parser(c("12,352,561"))
## [1] "number"
str(parse_guess("2010-10-10"))
##  Date[1:1], format: "2010-10-10"

#Problems Task37:

# readr_example() locates a CSV shipped with readr; this file is crafted
# so that type guessing from the first rows goes wrong.
challenge <- read_csv(readr_example("challenge.csv"))
## Rows: 2000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): x
## date (1): y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task38:

# List the rows readr could not parse with the guessed column types.
problems(challenge)

Task39:

# The problem rows are at the end of the file, so tail() reveals them.
tail(challenge)

Task40-

# First attempt: declaring y as logical fails — the tail holds dates.
challenge <- read_csv(
  readr_example("challenge.csv"), 
  col_types = cols(
    x = col_double(),
    y = col_logical()
  )
)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
# Fix: declare y as a date column explicitly.
challenge <- read_csv(
  readr_example("challenge.csv"), 
  col_types = cols(
    x = col_double(),
    y = col_date()
  )
)
tail(challenge)

#Other strategies Task41-

# Guess types from 1001 rows (instead of the default 1000) so the first
# date value is seen and y is guessed correctly.
challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
## Rows: 2000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): x
## date (1): y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task42-

# Inspect the result of the guess_max re-read.
challenge2

Task43-

# Safest fallback: read every column as character, then convert later
# (e.g. with type_convert()).
challenge2 <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(.default = col_character())
)

Task44-

# A small tibble of number-like strings, ready for type_convert().
df <- tibble(
  x = c("1", "2", "3"),
  y = c("1.21", "2.32", "4.56")
)
df

Task45-

# type_convert() re-guesses types for the character columns of an
# existing data frame (both become double here; see spec below).
type_convert(df)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   x = col_double(),
##   y = col_double()
## )

#Writing to a file. Task46-

# Write challenge out as CSV. Note: CSV stores only text, so column
# types must be re-guessed (or re-specified) when reading back.
write_csv(challenge, "challenge.csv")

Task47-

# The in-memory, correctly-typed version of challenge.
challenge

Task48-

# Round-trip through CSV: types are re-guessed on read because the file
# itself carries no type information.
write_csv(challenge, "challenge-2.csv")
read_csv("challenge-2.csv")
## Rows: 2000 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): x
## date (1): y
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Task49-

# RDS is R's binary format: it preserves column types exactly.
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")

Task50-

# feather is a fast binary columnar format shared with Python.
# NOTE(review): the feather format is now maintained in the arrow
# package; prefer arrow::write_feather() for new code.
#install.packages("feather")
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")

#Tidy data. Task51-

# The same TB data organised in different ways; only table1 is tidy
# (one variable per column, one observation per row).
table1
table2
table3

Task52-

# Cases and population live in two separate tables keyed by country/year.
# Spread across two tibbles
table4a  # cases
table4b  # population

Task53:

# Compute rate per 10,000 people; tidy layout makes this a one-line mutate.
table1 %>% 
  mutate(rate = cases / population * 10000)

Task54:

# Compute cases per year; count() with wt sums cases within each year.
table1 %>% 
  count(year, wt = cases)

Task55:

# Visualise changes over time
# (library(ggplot2) is redundant here — tidyverse already attached it.)
library(ggplot2)
ggplot(table1, aes(year, cases)) + 
  geom_line(aes(group = country), colour = "grey50") + 
  geom_point(aes(colour = country))

Task56:

# table4a: years appear as column names, i.e. values of a "year" variable.
table4a

Task57:

# pivot_longer() gathers the year columns into year/cases pairs.
table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")

Task58:

# Tidy both tables, then join them on their common keys (country, year).
tidy4a <- table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)
## Joining with `by = join_by(country, year)`

Task59:

# table2: one column ("type") holds variable names, another the counts.
table2

Task60:

# pivot_wider() spreads the type/count pairs back into separate cases
# and population columns.
table2 %>%
    pivot_wider(names_from = type, values_from = count)

#Separating and uniting. #Separate Task61:

# table3: rate is "cases/population" packed into one character column.
table3

Task62:

# Without sep =, separate() splits at any non-alphanumeric character.
table3 %>% 
  separate(rate, into = c("cases", "population"))

Task63:

# Split explicitly on "/".
table3 %>% 
  separate(rate, into = c("cases", "population"), sep = "/")

Task64:

# convert = TRUE re-types the new columns (character -> integer here).
table3 %>% 
  separate(rate, into = c("cases", "population"), convert = TRUE)

Task65:

# An integer sep splits at a character position: first 2 chars = century.
table3 %>% 
  separate(year, into = c("century", "year"), sep = 2)

#Unite Task66:

# unite() pastes century and year together (default separator is "_").
table5 %>% 
  unite(new, century, year)

Task67:

# sep = "" joins the pieces with no separator, rebuilding the full year.
table5 %>% 
  unite(new, century, year, sep = "")

#Missing values Task68:

# Example data with one EXPLICIT missing value (2015 Q4 return is NA)
# and one IMPLICIT one (the 2016 Q1 row is simply absent).
stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)

Task69:

# Widening makes the implicit 2016 Q1 missing value explicit (NA).
stocks %>% 
  pivot_wider(names_from = year, values_from = return)

Task70:

# Round-trip with values_drop_na = TRUE removes all NA rows, turning
# explicit missing values back into implicit ones.
stocks %>% 
  pivot_wider(names_from = year, values_from = return) %>% 
  pivot_longer(
    cols = c(`2015`, `2016`), 
    names_to = "year", 
    values_to = "return", 
    values_drop_na = TRUE
  )

Task71:

# complete() adds a row for every year/qtr combination, filling with NA.
stocks %>% 
  complete(year, qtr)

Task72:

# Data-entry style input: person is recorded only when it changes;
# the NAs mean "same as the row above".
treatment <- tribble(
  ~ person,           ~ treatment, ~response,
  "Derrick Whitmore", 1,           7,
  NA,                 2,           10,
  NA,                 3,           9,
  "Katherine Burke",  1,           4
)

Task73:

# fill() carries the last non-missing person value forward.
treatment %>% 
  fill(person)

#Case Study

# WHO tuberculosis data (ships with tidyr): very messy; tidied below.
who

Task74:

# Gather all the count columns; their names encode diagnosis/sex/age,
# which later steps unpack. values_drop_na removes empty cells.
who1 <- who %>% 
  pivot_longer(
    cols = new_sp_m014:newrel_f65, 
    names_to = "key", 
    values_to = "cases", 
    values_drop_na = TRUE
  )
who1

Task75:

# Count how often each encoded column name occurs.
who1 %>% 
  count(key)

Task76:

# Make the names consistent: "newrel" lacks the underscore the other
# keys have, which would break the separate() step below.
who2 <- who1 %>% 
  mutate(key = stringr::str_replace(key, "newrel", "new_rel"))
who2

Task77:

# Split the key at underscores into new / type / sexage.
who3 <- who2 %>% 
  separate(key, c("new", "type", "sexage"), sep = "_")
who3

Task78:

# "new" is constant in this dataset, so it carries no information.
who3 %>% 
  count(new)

Task79:

# Drop the constant column and the redundant country-code columns.
who4 <- who3 %>% 
  select(-new, -iso2, -iso3)

Task80:

# Split sexage after the first character: "m014" -> sex "m", age "014".
who5 <- who4 %>% 
  separate(sexage, c("sex", "age"), sep = 1)
who5

Task81:

# The whole cleanup as one pipeline (same steps as who1..who5 above).
who %>%
  pivot_longer(
    cols = new_sp_m014:newrel_f65, 
    names_to = "key", 
    values_to = "cases", 
    values_drop_na = TRUE
  ) %>% 
  mutate(
    key = stringr::str_replace(key, "newrel", "new_rel")
  ) %>%
  separate(key, c("new", "var", "sexage")) %>% 
  select(-new, -iso2, -iso3) %>% 
  separate(sexage, c("sex", "age"), sep = 1)